%%html
<style type="text/css">
/* NOTE(review): presumably styling for reveal.js slide mode (RISE) —
   removes margins around highlighted code cells and shrinks output
   text so cells fit on slides. Confirm the notebook is presented
   with RISE; otherwise this cell has no visible effect. */
.reveal div.highlight {
margin: 0;
}
.reveal div.highlight>pre {
margin: 0;
width: 100%;
font-size: 15px;
}
.reveal div.jp-OutputArea-output>pre {
margin: 0;
width: 75%;
font-size: var(--jp-code-font-size);
box-shadow: none;
}
</style>
Der Code basiert auf:
Karsdorp, Folgert / Kestemont, Mike / Riddell, Allen, Humanities Data Analysis. Case Studies with Python, Princeton University Press 2021, S. 19-30.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2
from collections import Counter
import random
import re
import string
%matplotlib inline
# Load the tokenized "Wutbürger" tweet corpus; parse the timestamp column
# so that the .dt accessor works below.
df = pd.read_csv('../data/220128-wutbuerger-tokenized.csv',
parse_dates=['date'], encoding='utf8')
# Derive the publication year, used as the grouping key throughout.
df.loc[:, 'year'] = df.loc[:, 'date'].dt.year
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9149 entries, 0 to 9148 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 9149 non-null int64 1 date 9149 non-null datetime64[ns, UTC] 2 tweet 9149 non-null object 3 hashtags 9149 non-null object 4 username 9149 non-null object 5 link 9149 non-null object 6 nretweets 9149 non-null int64 7 nlikes 9149 non-null int64 8 nreplies 9149 non-null int64 9 nqoutes 0 non-null float64 10 char_per_url_free_tweet 9149 non-null int64 11 tweet_clean 9149 non-null object 12 tweet_clean_removed 9149 non-null object 13 tokens 9149 non-null object 14 year 9149 non-null int64 dtypes: datetime64[ns, UTC](1), float64(1), int64(6), object(7) memory usage: 1.0+ MB
# Count keywords
def count_keywords(tokens, keywords):
    """Count how often each keyword occurs in a token sequence.

    Returns a list of integer counts aligned with the order of
    `keywords`; keywords absent from `tokens` count as 0.
    """
    hits = Counter(token for token in tokens if token in keywords)
    return [hits[keyword] for keyword in keywords]
def count_keywords_by(df, by, keywords, column='tokens'):
    """Build a keyword-frequency table grouped by `by`.

    `column` must hold whitespace-separated token strings; each row is
    split into a token list and passed through count_keywords. Returns a
    DataFrame with one row per group value and one column per keyword,
    sorted by the grouping key.
    """
    # One count vector (aligned with `keywords`) per document.
    per_doc = df[column].str.split().apply(count_keywords, keywords=keywords)
    freq_df = pd.DataFrame.from_records(per_doc, columns=keywords)
    # Carry over the grouping column(s) so we can aggregate.
    freq_df[by] = df[by]
    grouped = freq_df.groupby(by=by).sum()
    # `by` is now the index; sort_values accepts index level names.
    return grouped.sort_values(by)
# function for counting words
# see https://gist.github.com/susanli2016/69ec5333e9846044abd74268eed9d85b#file-top_unigram-py
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent words in `corpus` as (word, count)
    pairs, sorted by count descending. With n=None, return all words.
    """
    vectorizer = CountVectorizer().fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Column sums = corpus-wide frequency of each vocabulary entry.
    totals = doc_term.sum(axis=0)
    freqs = [(word, totals[0, col]) for word, col in vectorizer.vocabulary_.items()]
    freqs.sort(key=lambda pair: pair[1], reverse=True)
    return freqs[:n]
# The 1000 most frequent words of the whole corpus serve as keywords.
keywords = get_top_n_words(df.loc[:,'tokens'], 1000)
# get_top_n_words returns (word, count) pairs; keep only the words.
keywords = (list(list(zip(*keywords))[0]))
# Keyword frequencies per year (rows = years, columns = keywords).
df_counts = count_keywords_by(df, by='year', keywords=keywords)
df_counts.head(5).T
| year | 2010 | 2011 | 2012 | 2013 | 2014 |
|---|---|---|---|---|---|
| wutbürger | 516 | 785 | 465 | 431 | 413 |
| afd | 0 | 0 | 0 | 1 | 6 |
| ich | 25 | 55 | 17 | 18 | 23 |
| mal | 32 | 32 | 17 | 15 | 17 |
| pegida | 0 | 0 | 0 | 0 | 58 |
| ... | ... | ... | ... | ... | ... |
| lied | 1 | 1 | 0 | 0 | 0 |
| gehören | 0 | 0 | 0 | 0 | 0 |
| mist | 1 | 2 | 1 | 0 | 1 |
| terror | 0 | 0 | 0 | 0 | 0 |
| monat | 0 | 3 | 1 | 0 | 0 |
1000 rows × 5 columns
df_key = df_counts.copy()
# Row-wise sum of all keyword counts per year.
df_key.loc[:, 'total']= df_key.sum(axis=1)
# Normalise each row by its maximum. Because 'total' is the sum of the
# non-negative counts, it is also the row maximum, so this effectively
# divides every count by the year's total (relative frequency).
df_key = df_key.apply(lambda x: x/x.max(), axis=1)
df_key = df_key.drop(['total'], axis=1)
df_key.head(5).T
| year | 2010 | 2011 | 2012 | 2013 | 2014 |
|---|---|---|---|---|---|
| wutbürger | 0.221745 | 0.235382 | 0.200517 | 0.244747 | 0.244668 |
| afd | 0.000000 | 0.000000 | 0.000000 | 0.000568 | 0.003555 |
| ich | 0.010743 | 0.016492 | 0.007331 | 0.010221 | 0.013626 |
| mal | 0.013752 | 0.009595 | 0.007331 | 0.008518 | 0.010071 |
| pegida | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.034360 |
| ... | ... | ... | ... | ... | ... |
| lied | 0.000430 | 0.000300 | 0.000000 | 0.000000 | 0.000000 |
| gehören | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| mist | 0.000430 | 0.000600 | 0.000431 | 0.000000 | 0.000592 |
| terror | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| monat | 0.000000 | 0.000900 | 0.000431 | 0.000000 | 0.000000 |
999 rows × 5 columns
# Binary period label per year row: before 2015 vs. 2015 and later.
labels = ['vor 2015' if year < 2015 else 'nach 2015' for year in df_key.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic (keyness of each keyword w.r.t. the period split):
keyness, _ = chi2(df_key.fillna(0), labels)
# Turn keyness values into a Series, and sort in descending order:
keyness = pd.Series(keyness, index=df_key.columns).sort_values(ascending=False)
keyness.head(15)
wort 0.129818 s21 0.115413 afd 0.095070 jahr 0.086162 rt 0.084462 wutbürger 0.053477 2010 0.050938 bremen 0.038017 nazi 0.037076 querdenker 0.033656 düringer 0.033302 stuttgart 0.032370 ber 0.029268 noafd 0.027129 mutbürger 0.026170 dtype: float64
counts = df_counts.copy()
# Percentile ranks of total keyword frequency in each period.
# method='dense': ties share a rank, with no gaps between ranks.
pre_2015 = counts[counts.index < 2015].sum().rank(method='dense', pct=True)
post_2015 = counts[(counts.index > 2014)].sum().rank(method='dense', pct=True)
rankings = pd.DataFrame({'Vor_2015': pre_2015, 'Nach_2015': post_2015})
fig = plt.figure(figsize=(10, 5))
plt.scatter(x=rankings['Nach_2015'], y=rankings['Vor_2015'],
            c=rankings['Vor_2015'] - rankings['Nach_2015'],
            alpha=0.7, cmap='viridis')
# Annotate the 25 keywords with the highest chi2 keyness.
for i, row in rankings.loc[keyness.head(25).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2015'], row['Vor_2015']))
plt.xlabel('Rank nach Frequenz nach 2015')
plt.ylabel('Rank nach Frequenz vor 2015');  # fixed typo: was 'Frequen'
# Interactive version of the rank scatter: hovering shows the keyword,
# colour encodes the rank shift between the two periods.
fig = px.scatter(rankings, x='Nach_2015', y='Vor_2015',
hover_name=rankings.index,
color=rankings['Vor_2015'] - rankings['Nach_2015'],
color_continuous_scale='Viridis', width=750, height=450)
fig.show()
# Analysis A: restrict the corpus to the pre-2020 years.
df_A = df[df.loc[:, 'year'] < 2020]
# Reset the index so count_keywords_by can align the grouping column.
df_A = df_A.reset_index(drop=True)
keywords_A = get_top_n_words(df_A.loc[:,'tokens'], 1000)
# Keep only the words from the (word, count) pairs.
keywords_A = (list(list(zip(*keywords_A))[0]))
df_counts_A = count_keywords_by(df_A, by='year', keywords=keywords_A)
df_counts_A.head(5).T
| year | 2010 | 2011 | 2012 | 2013 | 2014 |
|---|---|---|---|---|---|
| wutbürger | 516 | 785 | 465 | 431 | 413 |
| afd | 0 | 0 | 0 | 1 | 6 |
| mal | 32 | 32 | 17 | 15 | 17 |
| pegida | 0 | 0 | 0 | 0 | 58 |
| ich | 25 | 55 | 17 | 18 | 23 |
| ... | ... | ... | ... | ... | ... |
| gegenrechts | 0 | 0 | 0 | 0 | 0 |
| schüren | 0 | 0 | 0 | 1 | 0 |
| initiative | 0 | 0 | 3 | 1 | 1 |
| les | 0 | 1 | 1 | 0 | 0 |
| normal | 0 | 1 | 0 | 1 | 1 |
999 rows × 5 columns
df_key_A = df_counts_A.copy()
# Row-wise total per year; see the analogous cell above — dividing by the
# row maximum is equivalent to dividing by 'total' (the sum of counts).
df_key_A.loc[:, 'total']= df_key_A.sum(axis=1)
df_key_A = df_key_A.apply(lambda x: x/x.max(), axis=1)
df_key_A = df_key_A.drop(['total'], axis=1)
df_key_A.head(5).T
| year | 2010 | 2011 | 2012 | 2013 | 2014 |
|---|---|---|---|---|---|
| wutbürger | 0.217630 | 0.229734 | 0.193669 | 0.235005 | 0.242229 |
| afd | 0.000000 | 0.000000 | 0.000000 | 0.000545 | 0.003519 |
| mal | 0.013496 | 0.009365 | 0.007080 | 0.008179 | 0.009971 |
| pegida | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.034018 |
| ich | 0.010544 | 0.016096 | 0.007080 | 0.009815 | 0.013490 |
| ... | ... | ... | ... | ... | ... |
| gegenrechts | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| schüren | 0.000000 | 0.000000 | 0.000000 | 0.000545 | 0.000000 |
| initiative | 0.000000 | 0.000000 | 0.001249 | 0.000545 | 0.000587 |
| les | 0.000000 | 0.000293 | 0.000416 | 0.000000 | 0.000000 |
| normal | 0.000000 | 0.000293 | 0.000000 | 0.000545 | 0.000587 |
998 rows × 5 columns
# Period label per year row: before 2015 vs. 2015 and later.
labels_A = ['vor 2015' if year < 2015 else 'nach 2015' for year in df_key_A.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness_A, _ = chi2(df_key_A.fillna(0), labels_A)
# Turn keyness values into a Series, and sort in descending order:
keyness_A = pd.Series(keyness_A, index=df_key_A.columns).sort_values(ascending=False)
keyness_A.head(15)
afd 0.113709 wort 0.080188 s21 0.060164 jahr 0.051791 rt 0.048140 nazi 0.038770 2010 0.031423 noafd 0.031199 bremen 0.023341 stuttgart 0.020815 besorgen 0.020531 wutbürger 0.020286 düringer 0.019523 amp 0.017205 ber 0.017022 dtype: float64
counts_A = df_counts_A.copy()
# Percentile ranks of total keyword frequency in each period.
pre_2015_A = counts_A[counts_A.index < 2015].sum().rank(method='dense', pct=True)
# BUGFIX: was `counts_A.index > 2015`, which silently excluded the year
# 2015 from the 'nach 2015' period — inconsistent with labels_A
# ('nach 2015' for year >= 2015) and with the full-corpus cell above,
# which uses `> 2014`.
post_2015_A = counts_A[(counts_A.index > 2014)].sum().rank(method='dense', pct=True)
rankings_A = pd.DataFrame({'Vor_2015': pre_2015_A, 'Nach_2015': post_2015_A})
fig = plt.figure(figsize=(10, 5))
plt.scatter(x=rankings_A['Nach_2015'], y=rankings_A['Vor_2015'],
            c=rankings_A['Vor_2015'] - rankings_A['Nach_2015'],
            alpha=0.7, cmap='viridis')
# Annotate the 25 keywords with the highest chi2 keyness.
for i, row in rankings_A.loc[keyness_A.head(25).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2015'], row['Vor_2015']))
plt.xlabel('Rank nach Frequenz nach 2015')
plt.ylabel('Rank nach Frequenz vor 2015');  # fixed typo: was 'Frequen'
# Interactive rank scatter for analysis A (hover shows the keyword).
fig = px.scatter(rankings_A, x='Nach_2015', y='Vor_2015',
hover_name=rankings_A.index,
color=rankings_A['Vor_2015'] - rankings_A['Nach_2015'],
color_continuous_scale='Viridis', width=750, height=450)
fig.show()
# NOTE(review): this cell is an exact duplicate of the previous plotly
# cell and renders the same figure again — consider deleting it.
fig = px.scatter(rankings_A, x='Nach_2015', y='Vor_2015',
hover_name=rankings_A.index,
color=rankings_A['Vor_2015'] - rankings_A['Nach_2015'],
color_continuous_scale='Viridis', width=750, height=450)
fig.show()
# Analysis B: restrict the corpus to the years from 2015 onward.
df_B = df[df.loc[:, 'year'] > 2014]
df_B = df_B.reset_index(drop=True)
keywords_B = get_top_n_words(df_B.loc[:,'tokens'], 1000)
# Keep only the words from the (word, count) pairs.
keywords_B = (list(list(zip(*keywords_B))[0]))
df_counts_B = count_keywords_by(df_B, by='year', keywords=keywords_B)
df_counts_B.head(5).T
| year | 2015 | 2016 | 2017 | 2018 | 2019 |
|---|---|---|---|---|---|
| wutbürger | 708 | 1140 | 822 | 1283 | 883 |
| afd | 34 | 133 | 130 | 353 | 130 |
| mal | 31 | 40 | 56 | 96 | 63 |
| nazi | 23 | 28 | 23 | 106 | 108 |
| pegida | 117 | 70 | 32 | 77 | 34 |
| ... | ... | ... | ... | ... | ... |
| kritisieren | 0 | 1 | 1 | 2 | 0 |
| chef | 0 | 1 | 2 | 3 | 2 |
| schicken | 0 | 3 | 0 | 1 | 3 |
| wünsche | 0 | 1 | 1 | 5 | 1 |
| verschwörungstheorien | 2 | 2 | 0 | 1 | 0 |
1000 rows × 5 columns
df_key_B = df_counts_B.copy()
# Row-wise total per year; dividing by the row maximum is equivalent to
# dividing by 'total' (the sum of the non-negative counts).
df_key_B.loc[:, 'total']= df_key_B.sum(axis=1)
df_key_B = df_key_B.apply(lambda x: x/x.max(), axis=1)
df_key_B = df_key_B.drop(['total'], axis=1)
df_key_B.head(5).T
| year | 2015 | 2016 | 2017 | 2018 | 2019 |
|---|---|---|---|---|---|
| wutbürger | 0.220629 | 0.219738 | 0.206377 | 0.128274 | 0.142649 |
| afd | 0.010595 | 0.025636 | 0.032639 | 0.035293 | 0.021002 |
| mal | 0.009660 | 0.007710 | 0.014060 | 0.009598 | 0.010178 |
| nazi | 0.007167 | 0.005397 | 0.005775 | 0.010598 | 0.017447 |
| pegida | 0.036460 | 0.013493 | 0.008034 | 0.007698 | 0.005493 |
| ... | ... | ... | ... | ... | ... |
| kritisieren | 0.000000 | 0.000193 | 0.000251 | 0.000200 | 0.000000 |
| chef | 0.000000 | 0.000193 | 0.000502 | 0.000300 | 0.000323 |
| schicken | 0.000000 | 0.000578 | 0.000000 | 0.000100 | 0.000485 |
| wünsche | 0.000000 | 0.000193 | 0.000251 | 0.000500 | 0.000162 |
| verschwörungstheorien | 0.000623 | 0.000386 | 0.000000 | 0.000100 | 0.000000 |
1000 rows × 5 columns
# Period label per year row: before 2020 vs. 2020 and later.
labels_B = ['vor 2020' if year < 2020 else 'nach 2020' for year in df_key_B.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness_B, _ = chi2(df_key_B.fillna(0), labels_B)
# Turn keyness values into a Series, and sort in descending order:
keyness_B = pd.Series(keyness_B, index=df_key_B.columns).sort_values(ascending=False)
keyness_B.head(15)
querdenker 0.089637 corona 0.036915 massnahmengegner 0.030801 putin 0.029628 ander 0.024669 pegida 0.023717 covidioten 0.023609 faschist 0.023009 schwurbler 0.020800 mainstream 0.019963 nftcommunity 0.015508 rechtsradikaler 0.015052 wutbürger 0.014950 impfgegner 0.014218 protestieren 0.013236 dtype: float64
counts_B = df_counts_B.copy()
# Percentile ranks of total keyword frequency before / from 2020 on.
pre_2020_B = counts_B[counts_B.index < 2020].sum().rank(method='dense', pct=True)
post_2020_B = counts_B[(counts_B.index > 2019)].sum().rank(method='dense', pct=True)
rankings_B = pd.DataFrame({'Vor_2020': pre_2020_B, 'Nach_2020': post_2020_B})
fig = plt.figure(figsize=(10, 5))
plt.scatter(x=rankings_B['Nach_2020'], y=rankings_B['Vor_2020'],
            c=rankings_B['Vor_2020'] - rankings_B['Nach_2020'],
            alpha=0.7, cmap='viridis')
# Annotate the 15 keywords with the highest chi2 keyness.
for i, row in rankings_B.loc[keyness_B.head(15).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2020'], row['Vor_2020']))
plt.xlabel('Rank nach Frequenz nach 2020')
plt.ylabel('Rank nach Frequenz vor 2020');  # fixed typo: was 'Frequen'
# Interactive rank scatter for analysis B (hover shows the keyword).
fig = px.scatter(rankings_B, x='Nach_2020', y='Vor_2020',
hover_name=rankings_B.index,
color=rankings_B['Vor_2020'] - rankings_B['Nach_2020'],
color_continuous_scale='Viridis', width=750, height=450)
fig.show()
# Analysis C: compare the two outer periods only — keep years before 2015
# and after 2019, dropping 2015-2019 entirely.
df_C = df[(df.loc[:, 'year'] < 2015) | (df.loc[:, 'year'] > 2019)]
df_C = df_C.reset_index(drop=True)
keywords_C = get_top_n_words(df_C.loc[:,'tokens'], 1000)
# Keep only the words from the (word, count) pairs.
keywords_C = (list(list(zip(*keywords_C))[0]))
df_counts_C = count_keywords_by(df_C, by='year', keywords=keywords_C)
df_counts_C.head(5).T
| year | 2010 | 2011 | 2012 | 2013 | 2014 |
|---|---|---|---|---|---|
| wutbürger | 516 | 785 | 465 | 431 | 413 |
| wort | 196 | 27 | 8 | 11 | 7 |
| s21 | 43 | 92 | 59 | 46 | 19 |
| ich | 25 | 55 | 17 | 18 | 23 |
| jahr | 159 | 35 | 7 | 5 | 5 |
| ... | ... | ... | ... | ... | ... |
| maskenverweigerer | 0 | 0 | 0 | 0 | 0 |
| leser | 1 | 2 | 1 | 0 | 1 |
| blödsinn | 1 | 0 | 0 | 0 | 0 |
| dummheit | 0 | 1 | 0 | 0 | 1 |
| funktionieren | 0 | 0 | 0 | 0 | 2 |
999 rows × 5 columns
df_key_C = df_counts_C.copy()
# Row-wise total per year; dividing by the row maximum is equivalent to
# dividing by 'total' (the sum of the non-negative counts).
df_key_C.loc[:, 'total']= df_key_C.sum(axis=1)
df_key_C = df_key_C.apply(lambda x: x/x.max(), axis=1)
df_key_C = df_key_C.drop(['total'], axis=1)
df_key_C.head(5).T
| year | 2010 | 2011 | 2012 | 2013 | 2014 |
|---|---|---|---|---|---|
| wutbürger | 0.216625 | 0.223075 | 0.183722 | 0.230975 | 0.242087 |
| wort | 0.082284 | 0.007673 | 0.003161 | 0.005895 | 0.004103 |
| s21 | 0.018052 | 0.026144 | 0.023311 | 0.024652 | 0.011137 |
| ich | 0.010495 | 0.015629 | 0.006717 | 0.009646 | 0.013482 |
| jahr | 0.066751 | 0.009946 | 0.002766 | 0.002680 | 0.002931 |
| ... | ... | ... | ... | ... | ... |
| maskenverweigerer | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| leser | 0.000420 | 0.000568 | 0.000395 | 0.000000 | 0.000586 |
| blödsinn | 0.000420 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| dummheit | 0.000000 | 0.000284 | 0.000000 | 0.000000 | 0.000586 |
| funktionieren | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.001172 |
998 rows × 5 columns
# Period label per year row. BUGFIX: the condition was `year < 2020`,
# which happens to be equivalent on this data (df_C contains no years
# 2015-2019) but contradicts the label text 'vor 2015' and would silently
# mislabel any 2015-2019 rows; use the split the label actually states.
labels_C = ['vor 2015' if year < 2015 else 'nach 2015' for year in df_key_C.index]
# replace missing values with zero (.fillna(0)),
# and compute the chi2 statistic:
keyness_C, _ = chi2(df_key_C.fillna(0), labels_C)
# Turn keyness values into a Series, and sort in descending order:
keyness_C = pd.Series(keyness_C, index=df_key_C.columns).sort_values(ascending=False)
keyness_C.head(15)
querdenker 0.085923 s21 0.060513 wutbürger 0.058647 afd 0.053915 wort 0.053183 jahr 0.040039 ander 0.038804 rt 0.036780 corona 0.035873 putin 0.030712 massnahmengegner 0.029417 faschist 0.029006 nazi 0.028433 covidioten 0.023140 schwurbler 0.021320 dtype: float64
counts_C = df_counts_C.copy()
# Percentile ranks of total keyword frequency: pre-2015 vs. post-2019
# (the middle years were excluded from df_C).
pre_2015_C = counts_C[counts_C.index < 2015].sum().rank(method='dense', pct=True)
post_2020_C = counts_C[(counts_C.index > 2019)].sum().rank(method='dense', pct=True)
rankings_C = pd.DataFrame({'Vor_2015': pre_2015_C, 'Nach_2015': post_2020_C})
fig = plt.figure(figsize=(10, 5))
plt.scatter(x=rankings_C['Nach_2015'], y=rankings_C['Vor_2015'],
            c=rankings_C['Vor_2015'] - rankings_C['Nach_2015'],
            alpha=0.7, cmap='viridis')
# Annotate the 25 keywords with the highest chi2 keyness.
for i, row in rankings_C.loc[keyness_C.head(25).index].iterrows():
    plt.annotate(i, xy=(row['Nach_2015'], row['Vor_2015']))
plt.xlabel('Rank nach Frequenz nach 2020')
plt.ylabel('Rank nach Frequenz vor 2015');  # fixed typo: was 'Frequen'
# Interactive rank scatter for analysis C (hover shows the keyword).
fig = px.scatter(rankings_C, x='Nach_2015', y='Vor_2015',
hover_name=rankings_C.index,
color=rankings_C['Vor_2015'] - rankings_C['Nach_2015'],
color_continuous_scale='Viridis', width=750, height=450)
fig.show()